Introduction

Introduction text

For an interactive app to visualize this data, please see: Shiny App

Data Description

A description of the data, where it came from, and what it contains.

Read the Data

train <- read.csv("Data/train.csv")
test <- read.csv("Data/test.csv")

# Merge the data frames and add a column indicating whether they come from the train or test set
train$train <- 1
test$SalePrice <- NA
test$train <- 0
ames <- rbind(train, test)

# Verify data frame
head(ames)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice train
## 1       0      2   2008       WD        Normal    208500     1
## 2       0      5   2007       WD        Normal    181500     1
## 3       0      9   2008       WD        Normal    223500     1
## 4       0      2   2006       WD       Abnorml    140000     1
## 5       0     12   2008       WD        Normal    250000     1
## 6     700     10   2009       WD        Normal    143000     1
str(ames)
## 'data.frame':    2919 obs. of  82 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
##  $ train        : num  1 1 1 1 1 1 1 1 1 1 ...
summary(ames)
##        Id           MSSubClass       MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.00   Length:2919        Min.   : 21.00  
##  1st Qu.: 730.5   1st Qu.: 20.00   Class :character   1st Qu.: 59.00  
##  Median :1460.0   Median : 50.00   Mode  :character   Median : 68.00  
##  Mean   :1460.0   Mean   : 57.14                      Mean   : 69.31  
##  3rd Qu.:2189.5   3rd Qu.: 70.00                      3rd Qu.: 80.00  
##  Max.   :2919.0   Max.   :190.00                      Max.   :313.00  
##                                                       NA's   :486     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:2919        Length:2919        Length:2919       
##  1st Qu.:  7478   Class :character   Class :character   Class :character  
##  Median :  9453   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10168                                                           
##  3rd Qu.: 11570                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:2919        Length:2919        Length:2919        Length:2919       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:2919        Length:2919        Length:2919        Length:2919       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:2919        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.089   Mean   :5.565   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:2919        Length:2919        Length:2919       
##  1st Qu.:1965   Class :character   Class :character   Class :character  
##  Median :1993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:2919        Length:2919        Min.   :   0.0   Length:2919       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 102.2                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :23                         
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:2919        Length:2919        Length:2919        Length:2919       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:2919        Length:2919        Min.   :   0.0   Length:2919       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 368.5   Mode  :character  
##                                        Mean   : 441.4                     
##                                        3rd Qu.: 733.0                     
##                                        Max.   :5644.0                     
##                                        NA's   :1                          
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:2919       
##  1st Qu.:   0.00   1st Qu.: 220.0   1st Qu.: 793.0   Class :character  
##  Median :   0.00   Median : 467.0   Median : 989.5   Mode  :character  
##  Mean   :  49.58   Mean   : 560.8   Mean   :1051.8                     
##  3rd Qu.:   0.00   3rd Qu.: 805.5   3rd Qu.:1302.0                     
##  Max.   :1526.00   Max.   :2336.0   Max.   :6110.0                     
##  NA's   :1         NA's   :1        NA's   :1                          
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:2919        Length:2919        Length:2919        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 876  
##  Mode  :character   Mode  :character   Mode  :character   Median :1082  
##                                                           Mean   :1160  
##                                                           3rd Qu.:1388  
##                                                           Max.   :5095  
##                                                                         
##    X2ndFlrSF       LowQualFinSF        GrLivArea     BsmtFullBath   
##  Min.   :   0.0   Min.   :   0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0.0   1st Qu.:   0.000   1st Qu.:1126   1st Qu.:0.0000  
##  Median :   0.0   Median :   0.000   Median :1444   Median :0.0000  
##  Mean   : 336.5   Mean   :   4.694   Mean   :1501   Mean   :0.4299  
##  3rd Qu.: 704.0   3rd Qu.:   0.000   3rd Qu.:1744   3rd Qu.:1.0000  
##  Max.   :2065.0   Max.   :1064.000   Max.   :5642   Max.   :3.0000  
##                                                     NA's   :2       
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr 
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.00  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.00  
##  Mean   :0.06136   Mean   :1.568   Mean   :0.3803   Mean   :2.86  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.00  
##  Max.   :2.00000   Max.   :4.000   Max.   :2.0000   Max.   :8.00  
##  NA's   :2                                                        
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:2919        Min.   : 2.000   Length:2919       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.045                      Mean   : 6.452                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:2919        Length:2919        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1960  
##  Median :1.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5971                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :159   
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:2919        Min.   :0.000   Min.   :   0.0   Length:2919       
##  Class :character   1st Qu.:1.000   1st Qu.: 320.0   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 472.9                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :5.000   Max.   :1488.0                     
##                     NA's   :1       NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF       OpenPorchSF    
##  Length:2919        Length:2919        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 26.00  
##                                        Mean   :  93.71   Mean   : 47.49  
##                                        3rd Qu.: 168.00   3rd Qu.: 70.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  EnclosedPorch      X3SsnPorch       ScreenPorch        PoolArea      
##  Min.   :   0.0   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.0   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.0   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  23.1   Mean   :  2.602   Mean   : 16.06   Mean   :  2.252  
##  3rd Qu.:   0.0   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.0   Max.   :508.000   Max.   :576.00   Max.   :800.000  
##                                                                       
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:2919        Length:2919        Length:2919        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   50.83  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:2919        Length:2919       
##  1st Qu.: 4.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.213   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice          train       
##  Min.   : 34900   Min.   :0.0000  
##  1st Qu.:129975   1st Qu.:0.0000  
##  Median :163000   Median :1.0000  
##  Mean   :180921   Mean   :0.5002  
##  3rd Qu.:214000   3rd Qu.:1.0000  
##  Max.   :755000   Max.   :1.0000  
##  NA's   :1459

For data cleaning purposes, we will merge test and train into one dataset, keeping in mind that the 1459 NA’s in the SalePrice column are from the test set. We will also add a column to indicate whether the row is from the train or test set.

Data Exploration

Next, we will start exploring the data for insights into the Ames housing market as well as what variables we need to change, clean, or create.

First, we will visualize where the NA values are and whether it will affect the first to parameters we are concerned about: Sale Price and Gross Living Area.

# Summarize NA's by  column
ames %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  gather(key = "Column", value = "NA_Count", -1) %>%
  filter(NA_Count > 0) %>%
  ggplot(aes(x = reorder(Column, NA_Count), y = NA_Count)) +
  geom_col() +
  coord_flip() +
  theme_gdocs() +
  labs(title = "Number of NA's by Column", x = "Column", y = "NA Count")

# Create a table of the missing NAs by column
ames %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  gather(key = "Column", value = "NA_Count", -1) %>%
  filter(NA_Count > 0) %>%
  arrange(desc(NA_Count)) %>%
  kable()
Id Column NA_Count
0 PoolQC 2909
0 MiscFeature 2814
0 Alley 2721
0 Fence 2348
0 SalePrice 1459
0 FireplaceQu 1420
0 LotFrontage 486
0 GarageYrBlt 159
0 GarageFinish 159
0 GarageQual 159
0 GarageCond 159
0 GarageType 157
0 BsmtCond 82
0 BsmtExposure 82
0 BsmtQual 81
0 BsmtFinType2 80
0 BsmtFinType1 79
0 MasVnrType 24
0 MasVnrArea 23
0 MSZoning 4
0 Utilities 2
0 BsmtFullBath 2
0 BsmtHalfBath 2
0 Functional 2
0 Exterior1st 1
0 Exterior2nd 1
0 BsmtFinSF1 1
0 BsmtFinSF2 1
0 BsmtUnfSF 1
0 TotalBsmtSF 1
0 Electrical 1
0 KitchenQual 1
0 GarageCars 1
0 GarageArea 1
0 SaleType 1

There are not too many NA’s in the data set, and they appear mostly to do with lack of a certain feature. For example, if a house does not have a pool, then the PoolQC column will be NA. We will need to decide how to handle these NA’s, but for now we will leave them as is and continue with the analysis of Sale Price and Gross Living Area.

Analysis 1: Sale Price and Gross Living Area

Restate the problem here

Build and fit the model

Entire Dataset

# Plot Sale Price vs. Gross Living Area colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(title = "Sale Price vs. Gross Living Area by Neighborhood", x = "Gross Living Area", y = "Sale Price")

There is clearly a relationship between Sale Price and Gross Living Area, and the neighborhoods appear to have a similar relationship. We will now look at log transformations of the data to see if there is more linear relationship.

# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x = log(GrLivArea), y = log(SalePrice), color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  )

This relationship appears to be more linear. We will create columns for the log of Sale Price and Gross Living Area and use these in our analysis.

# Create columns for log(SalePrice) and log(GrLivArea)
ames$logSalePrice <- log(ames$SalePrice)
ames$logGrLivArea <- log(ames$GrLivArea)

PRESS <- function(linear.model) {
  #' calculate the predictive residuals
  pr <- residuals(linear.model) / (1 - lm.influence(linear.model)$hat)
  #' calculate the PRESS
  PRESS <- sum(pr^2)

  return(PRESS)
}
# Function for calculating PRESS
# Tom Hopper
# https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0

^Just some code for CV PRESS and log transformations. Jae: What do you think about using AIC rather than PRESS? The models are optimized using AIC.

Century 21 Area

Next, we will visualize the relationship between log Sale Price and log Gross Living Area for the neighborhoods that Century21 operates in: NAmes, Edwards and BrkSide.

# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA for only the neighborhoods of interest
century21 <-
  ames %>%
  filter(!is.na(SalePrice)) %>%
  filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide")) 
century21 %>%
  ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  )

The relationship appears to be linear, so we will fit a linear model using this data and asses whether it describes the Sale Prices accurately.

# Fit a linear model to the data
fit1x <- lm(logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
summary(fit1x)
## 
## Call:
## lm(formula = logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72154 -0.10592  0.02469  0.11565  0.79364 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.76936    0.22919  33.900  < 2e-16 ***
## logGrLivArea         0.55579    0.03237  17.171  < 2e-16 ***
## NeighborhoodEdwards -0.02044    0.03252  -0.629     0.53    
## NeighborhoodNAmes    0.13279    0.02906   4.569 6.63e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1961 on 379 degrees of freedom
## Multiple R-squared:  0.4897, Adjusted R-squared:  0.4857 
## F-statistic: 121.2 on 3 and 379 DF,  p-value: < 2.2e-16
PRESS(fit1x)
## [1] 15.00131
# Fit a linear model to the data with interaction variables
fit1 <- lm(logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
summary(fit1)
## 
## Call:
## lm(formula = logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72080 -0.10353  0.02184  0.10586  0.80470 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       5.91292    0.50459  11.718  < 2e-16 ***
## logGrLivArea                      0.81965    0.07163  11.443  < 2e-16 ***
## NeighborhoodEdwards               2.09359    0.64589   3.241   0.0013 ** 
## NeighborhoodNAmes                 2.57981    0.59988   4.301 2.17e-05 ***
## logGrLivArea:NeighborhoodEdwards -0.29998    0.09122  -3.289   0.0011 ** 
## logGrLivArea:NeighborhoodNAmes   -0.34662    0.08482  -4.087 5.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1923 on 377 degrees of freedom
## Multiple R-squared:  0.5121, Adjusted R-squared:  0.5056 
## F-statistic: 79.14 on 5 and 377 DF,  p-value: < 2.2e-16
PRESS(fit1)
## [1] 14.60908
confint(fit1) %>% kable()
2.5 % 97.5 %
(Intercept) 4.9207572 6.9050843
logGrLivArea 0.6788064 0.9604897
NeighborhoodEdwards 0.8235795 3.3635933
NeighborhoodNAmes 1.4002744 3.7593394
logGrLivArea:NeighborhoodEdwards -0.4793353 -0.1206263
logGrLivArea:NeighborhoodNAmes -0.5134042 -0.1798447
# Plot the data with the linear model superposed
century21 %>%
  ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE, linewidth = 1,
    data = data.frame(
      logGrLivArea = century21$logGrLivArea,
      Neighborhood = century21$Neighborhood,
      logSalePrice = predict(fit1)
    )
  )

# # Print parameter estimate table nicely. Not working, needs debugging
# fit1 %>%
#   summary() %>%
#   {cbind(as.data.frame(coef(.)), .[["coefficients"]][, 2:4])} %>%
#   setNames(c("Estimate", "Std. Error", "t-value", "Pr(>|t|)")) %>%
#   rownames_to_column(var = "Term") %>%
#   mutate(Term = ifelse(Term == "(Intercept)", "Intercept", Term)) %>%
#   add_row(Term = "Adjusted R-squared", Estimate = round(.$adj.r.squared, 3), Std..Error = NA, `t-value` = NA, `Pr(>|t|)` = NA) %>%
#   kable(digits = 3, align = "c") %>%
#   kable_styling(full_width = FALSE)

We fit models with and without interaction terms. The interaction terms are statistically significant (include p values) and the model with interaction terms has a lower PRESS. This means that the relationship between log sale price and log gross living area is different for each neighborhood.

It might be good to test whether Edwards and NAmes are are statistically different. We could do that by re-referencing or doing the BYOA method.

Check the Assumptions

Residual Plots

# Plot the studentized residuals using base R
plot(fit1$fitted.values, fit1$residuals, type = "p")

plot(fit1$residuals)

# Decide which of these we like better ggplot or R

# Calculate studentized residuals
stud_res <- rstudent(fit1)
# Create a data frame with the studentized residuals
df <- data.frame(stud_res, logGrLivArea = model.frame(fit1)$logGrLivArea)

# Create a scatterplot of the studentized residuals
ggplot(df, aes(x = logGrLivArea, y = stud_res)) +
  geom_point() +
  labs(title = "Scatterplot of Studentized Residuals",
  x = "Studentized Residuals",
  y = "Frequency") +
  theme_minimal()

# Create histogram with normal curve
ggplot(df, aes(x = stud_res)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "lightblue", color = "black") +
  stat_function(fun = dnorm, args = list(mean = mean(df$stud_res), sd = sd(df$stud_res)), color = "blue", size = 1.2) +
  labs(title = "Histogram of Studentized Residuals with Normal Curve",
  x = "Studentized Residuals",
  y = "Frequency") +
  theme_minimal()

The plots show the same thing, please include whichever style you think looks better. There is no evidence of non-linearity, heteroscedasticity, or non-normality.

Influential Points

# Plot the residuals vs. the fitted values
fit1 %>%
  plot()

There are no influential points that need to be investigated. The results are consistent with the assumptions of the linear model.

Comparing competing models

This is already done, just need to move it here. Adj R2, CV press

Parameters (linear model)

Estimates, interpretation, confidence

Conclusion

A short summary of the analysis

RShiny App - Sale price and gross living area

Either embed the app here or link to it

Analysis 2: Sale Price

Restate the problem

Data Cleaning

In order to use a linear regression model, we need to convert all of the categorical variables into dummy variables. We will also remove or impute the NA’s in the data set. Please see the appendix for details on this process.

# Data Cleaning

# If pool-related variables are NA, assume there is no pool and assign to 0
ames <- ames %>%
  mutate(
    PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
    PoolArea = ifelse(is.na(PoolArea), 0, PoolArea),
  )
# If garage-related variables are NA, assume there is no garage and assign to 0
ames <- ames %>%
  mutate(
    GarageType = ifelse(is.na(GarageType), "None", GarageType),
    #GarageYrBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt), #These will be changed to the mean because of large year values
    GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
    GarageCars = ifelse(is.na(GarageCars), 0, GarageCars),
    GarageArea = ifelse(is.na(GarageArea), 0, GarageArea),
    GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
    GarageCond = ifelse(is.na(GarageCond), "None", GarageCond)
  )
# If Bsmt-related variables are NA, assume there is no Bsmt and assign to 0
ames <- ames %>%
  mutate(
    BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
    BsmtCond = ifelse(is.na(BsmtCond), "None", BsmtCond),
    BsmtExposure = ifelse(is.na(BsmtExposure), "None", BsmtExposure),
    BsmtFinType1 = ifelse(is.na(BsmtFinType1), "None", BsmtFinType1),
    BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
    BsmtFinType2 = ifelse(is.na(BsmtFinType2), "None", BsmtFinType2),
    BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), 0, BsmtFinSF2),
    BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF),
    TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF)
  )
# If Fence-related variables are NA, assume there is no Fence and assign to 0
ames <- ames %>%
  mutate(
    Fence = ifelse(is.na(Fence), "None", Fence), 
  )
# If Misc-related variables are NA, assume there is no Misc features and assign to 0
ames <- ames %>%
  mutate(
    MiscFeature = ifelse(is.na(MiscFeature), "None", MiscFeature), 
  )
# If Fireplace-related variables are NA, assume there is no Fireplace and assign to 0
ames <- ames %>%
  mutate(
    FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
  )
# If Alley-related variables are NA, assume there is no Alley and assign to 0
ames <- ames %>%
  mutate(
    Alley = ifelse(is.na(Alley), "None", Alley),
  )

# Summarize the amount of remaining NA's by column to check what's left
colSums(is.na(ames))
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             4           486             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0             0             0             0             2 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             1             1 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##            24            23             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##             0             0             0             0             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##             0             0             0             0             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             2             2             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             1             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             2             0             0             0           159 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##             0             0             0             0             0 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0             0             0             0 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             1             0 
##     SalePrice         train  logSalePrice  logGrLivArea 
##          1459             0          1459             0
# Use the dummyVars() function to convert categorical variables into dummy variables
# Then use janitor::clean_names() to clean up the column names
dummy_model <- dummyVars(~ ., data = ames)
ames_dummy <- as.data.frame(predict(dummy_model, newdata = ames))
ames_dummy <- clean_names(ames_dummy)

# Fill in all remaining na values with the mean of the column
ames_dummy <- ames_dummy %>%
  mutate(across(
    c(-sale_price, -log_sale_price),
    ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)
  ))
# Summarize the amount of remaining NA's by column
colSums(is.na(ames_dummy))
##                      id            ms_sub_class         ms_zoning_c_all 
##                       0                       0                       0 
##            ms_zoning_fv            ms_zoning_rh            ms_zoning_rl 
##                       0                       0                       0 
##            ms_zoning_rm            lot_frontage                lot_area 
##                       0                       0                       0 
##             street_grvl             street_pave              alley_grvl 
##                       0                       0                       0 
##              alley_none              alley_pave           lot_shape_ir1 
##                       0                       0                       0 
##           lot_shape_ir2           lot_shape_ir3           lot_shape_reg 
##                       0                       0                       0 
##        land_contour_bnk        land_contour_hls        land_contour_low 
##                       0                       0                       0 
##        land_contour_lvl       utilities_all_pub      utilities_no_se_wa 
##                       0                       0                       0 
##       lot_config_corner    lot_config_cul_d_sac          lot_config_fr2 
##                       0                       0                       0 
##          lot_config_fr3       lot_config_inside          land_slope_gtl 
##                       0                       0                       0 
##          land_slope_mod          land_slope_sev    neighborhood_blmngtn 
##                       0                       0                       0 
##    neighborhood_blueste    neighborhood_br_dale   neighborhood_brk_side 
##                       0                       0                       0 
##   neighborhood_clear_cr   neighborhood_collg_cr    neighborhood_crawfor 
##                       0                       0                       0 
##    neighborhood_edwards    neighborhood_gilbert     neighborhood_idotrr 
##                       0                       0                       0 
##   neighborhood_meadow_v    neighborhood_mitchel     neighborhood_n_ames 
##                       0                       0                       0 
##   neighborhood_no_ridge  neighborhood_n_pk_vill   neighborhood_nridg_ht 
##                       0                       0                       0 
##    neighborhood_nw_ames   neighborhood_old_town     neighborhood_sawyer 
##                       0                       0                       0 
##   neighborhood_sawyer_w    neighborhood_somerst   neighborhood_stone_br 
##                       0                       0                       0 
##      neighborhood_swisu     neighborhood_timber    neighborhood_veenker 
##                       0                       0                       0 
##        condition1artery         condition1feedr          condition1norm 
##                       0                       0                       0 
##         condition1pos_a         condition1pos_n         condition1rr_ae 
##                       0                       0                       0 
##         condition1rr_an         condition1rr_ne         condition1rr_nn 
##                       0                       0                       0 
##        condition2artery         condition2feedr          condition2norm 
##                       0                       0                       0 
##         condition2pos_a         condition2pos_n         condition2rr_ae 
##                       0                       0                       0 
##         condition2rr_an         condition2rr_nn           bldg_type1fam 
##                       0                       0                       0 
##        bldg_type2fm_con        bldg_type_duplex         bldg_type_twnhs 
##                       0                       0                       0 
##       bldg_type_twnhs_e       house_style1_5fin       house_style1_5unf 
##                       0                       0                       0 
##       house_style1story       house_style2_5fin       house_style2_5unf 
##                       0                       0                       0 
##       house_style2story     house_style_s_foyer       house_style_s_lvl 
##                       0                       0                       0 
##            overall_qual            overall_cond              year_built 
##                       0                       0                       0 
##          year_remod_add         roof_style_flat        roof_style_gable 
##                       0                       0                       0 
##      roof_style_gambrel          roof_style_hip      roof_style_mansard 
##                       0                       0                       0 
##         roof_style_shed      roof_matl_cly_tile      roof_matl_comp_shg 
##                       0                       0                       0 
##       roof_matl_membran         roof_matl_metal          roof_matl_roll 
##                       0                       0                       0 
##       roof_matl_tar_grv      roof_matl_wd_shake      roof_matl_wd_shngl 
##                       0                       0                       0 
##    exterior1st_asb_shng    exterior1st_asph_shn    exterior1st_brk_comm 
##                       0                       0                       0 
##    exterior1st_brk_face     exterior1st_c_block    exterior1st_cemnt_bd 
##                       0                       0                       0 
##    exterior1st_hd_board    exterior1st_im_stucc    exterior1st_metal_sd 
##                       0                       0                       0 
##     exterior1st_plywood       exterior1st_stone      exterior1st_stucco 
##                       0                       0                       0 
##    exterior1st_vinyl_sd     exterior1st_wd_sdng    exterior1st_wd_shing 
##                       0                       0                       0 
##    exterior2nd_asb_shng    exterior2nd_asph_shn     exterior2nd_brk_cmn 
##                       0                       0                       0 
##    exterior2nd_brk_face     exterior2nd_c_block    exterior2nd_cment_bd 
##                       0                       0                       0 
##    exterior2nd_hd_board    exterior2nd_im_stucc    exterior2nd_metal_sd 
##                       0                       0                       0 
##       exterior2nd_other     exterior2nd_plywood       exterior2nd_stone 
##                       0                       0                       0 
##      exterior2nd_stucco    exterior2nd_vinyl_sd     exterior2nd_wd_sdng 
##                       0                       0                       0 
##     exterior2nd_wd_shng    mas_vnr_type_brk_cmn   mas_vnr_type_brk_face 
##                       0                       0                       0 
##       mas_vnr_type_none      mas_vnr_type_stone            mas_vnr_area 
##                       0                       0                       0 
##           exter_qual_ex           exter_qual_fa           exter_qual_gd 
##                       0                       0                       0 
##           exter_qual_ta           exter_cond_ex           exter_cond_fa 
##                       0                       0                       0 
##           exter_cond_gd           exter_cond_po           exter_cond_ta 
##                       0                       0                       0 
##      foundation_brk_til      foundation_c_block       foundation_p_conc 
##                       0                       0                       0 
##         foundation_slab        foundation_stone         foundation_wood 
##                       0                       0                       0 
##            bsmt_qual_ex            bsmt_qual_fa            bsmt_qual_gd 
##                       0                       0                       0 
##          bsmt_qual_none            bsmt_qual_ta            bsmt_cond_fa 
##                       0                       0                       0 
##            bsmt_cond_gd          bsmt_cond_none            bsmt_cond_po 
##                       0                       0                       0 
##            bsmt_cond_ta        bsmt_exposure_av        bsmt_exposure_gd 
##                       0                       0                       0 
##        bsmt_exposure_mn        bsmt_exposure_no      bsmt_exposure_none 
##                       0                       0                       0 
##       bsmt_fin_type1alq       bsmt_fin_type1blq       bsmt_fin_type1glq 
##                       0                       0                       0 
##      bsmt_fin_type1lw_q      bsmt_fin_type1none       bsmt_fin_type1rec 
##                       0                       0                       0 
##       bsmt_fin_type1unf            bsmt_fin_sf1       bsmt_fin_type2alq 
##                       0                       0                       0 
##       bsmt_fin_type2blq       bsmt_fin_type2glq      bsmt_fin_type2lw_q 
##                       0                       0                       0 
##      bsmt_fin_type2none       bsmt_fin_type2rec       bsmt_fin_type2unf 
##                       0                       0                       0 
##            bsmt_fin_sf2             bsmt_unf_sf           total_bsmt_sf 
##                       0                       0                       0 
##           heating_floor           heating_gas_a           heating_gas_w 
##                       0                       0                       0 
##            heating_grav           heating_oth_w            heating_wall 
##                       0                       0                       0 
##           heating_qc_ex           heating_qc_fa           heating_qc_gd 
##                       0                       0                       0 
##           heating_qc_po            heating_qcta           central_air_n 
##                       0                       0                       0 
##           central_air_y       electrical_fuse_a       electrical_fuse_f 
##                       0                       0                       0 
##       electrical_fuse_p          electrical_mix       electrical_s_brkr 
##                       0                       0                       0 
##             x1st_flr_sf             x2nd_flr_sf         low_qual_fin_sf 
##                       0                       0                       0 
##             gr_liv_area          bsmt_full_bath          bsmt_half_bath 
##                       0                       0                       0 
##               full_bath               half_bath          bedroom_abv_gr 
##                       0                       0                       0 
##          kitchen_abv_gr         kitchen_qual_ex         kitchen_qual_fa 
##                       0                       0                       0 
##         kitchen_qual_gd         kitchen_qual_ta         tot_rms_abv_grd 
##                       0                       0                       0 
##         functional_maj1         functional_maj2         functional_min1 
##                       0                       0                       0 
##         functional_min2          functional_mod          functional_sev 
##                       0                       0                       0 
##          functional_typ              fireplaces         fireplace_qu_ex 
##                       0                       0                       0 
##         fireplace_qu_fa         fireplace_qu_gd       fireplace_qu_none 
##                       0                       0                       0 
##         fireplace_qu_po         fireplace_qu_ta       garage_type2types 
##                       0                       0                       0 
##      garage_type_attchd     garage_type_basment    garage_type_built_in 
##                       0                       0                       0 
##    garage_type_car_port      garage_type_detchd        garage_type_none 
##                       0                       0                       0 
##           garage_yr_blt       garage_finish_fin      garage_finish_none 
##                       0                       0                       0 
##      garage_finish_r_fn       garage_finish_unf             garage_cars 
##                       0                       0                       0 
##             garage_area          garage_qual_ex          garage_qual_fa 
##                       0                       0                       0 
##          garage_qual_gd        garage_qual_none          garage_qual_po 
##                       0                       0                       0 
##          garage_qual_ta          garage_cond_ex          garage_cond_fa 
##                       0                       0                       0 
##          garage_cond_gd        garage_cond_none          garage_cond_po 
##                       0                       0                       0 
##          garage_cond_ta           paved_drive_n           paved_drive_p 
##                       0                       0                       0 
##           paved_drive_y            wood_deck_sf           open_porch_sf 
##                       0                       0                       0 
##          enclosed_porch             x3ssn_porch            screen_porch 
##                       0                       0                       0 
##               pool_area              pool_qc_ex              pool_qc_fa 
##                       0                       0                       0 
##              pool_qc_gd            pool_qc_none            fence_gd_prv 
##                       0                       0                       0 
##             fence_gd_wo            fence_mn_prv             fence_mn_ww 
##                       0                       0                       0 
##              fence_none       misc_feature_gar2       misc_feature_none 
##                       0                       0                       0 
##       misc_feature_othr       misc_feature_shed      misc_feature_ten_c 
##                       0                       0                       0 
##                misc_val                 mo_sold                 yr_sold 
##                       0                       0                       0 
##           sale_type_cod           sale_type_con        sale_type_con_ld 
##                       0                       0                       0 
##        sale_type_con_li        sale_type_con_lw           sale_type_cwd 
##                       0                       0                       0 
##           sale_type_new           sale_type_oth            sale_type_wd 
##                       0                       0                       0 
##  sale_condition_abnorml sale_condition_adj_land   sale_condition_alloca 
##                       0                       0                       0 
##   sale_condition_family   sale_condition_normal  sale_condition_partial 
##                       0                       0                       0 
##              sale_price                   train          log_sale_price 
##                    1459                       0                    1459 
##         log_gr_liv_area 
##                       0
# Split the data into training and testing sets
train <- ames_dummy[ames_dummy$train == 1, ]
test <- ames_dummy[ames_dummy$train == 0, ]

We can write as much or as little about this as we’d like. Basically, we tried to interpret what the NA’s actually were saying and then make a more informed guess as to what the were rather than going with the mean. In most cases, this created a new category of “none”. Remaining values were imputed to the mean, and the data was split into training and testing sets.

Model Selection

talk about feature selection methods

Forward Selection

# Forward Selection

# # Testing olsrr method.
# library(olsrr)
# fit2x <- lm(log_sale_price ~ . - sale_price, data = train)
# fit2y <- ols_step_forward_p(fit2x, penter = 0.15)$model
# summary(fit2y)
# defaultSummary(data.frame(pred = predict(fit2y), obs = train$log_sale_price))
# PRESS(fit2y)

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_forwards.rds")) {
  # Load the model object from disk
  fit2 <- readRDS("Models/lm_forwards.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  ctrl <- trainControl(
    method = "boot",
    number = 5,
    allowParallel = TRUE
  )
  fit2 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = ctrl,
    direction = "forward",
    penter = 0.05 # Not Working.
  )

  # Stop the parallel backend
  stopCluster(cl)

  # Save the model object to disk
  saveRDS(fit2, "Models/lm_forwards.rds")
}

summary(fit2$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.71103  -0.04529   0.00088   0.05269   0.60423  
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.575e+00  5.641e-01   4.565 5.45e-06 ***
## overall_qual            3.809e-02  3.940e-03   9.668  < 2e-16 ***
## log_gr_liv_area         3.724e-01  4.096e-02   9.091  < 2e-16 ***
## year_built              2.109e-03  2.175e-04   9.699  < 2e-16 ***
## overall_cond            4.115e-02  3.341e-03  12.317  < 2e-16 ***
## total_bsmt_sf           1.388e-04  1.218e-05  11.389  < 2e-16 ***
## roof_matl_cly_tile     -2.536e+00  1.522e-01 -16.671  < 2e-16 ***
## bsmt_unf_sf            -5.867e-05  1.210e-05  -4.848 1.39e-06 ***
## garage_cars             2.679e-02  8.629e-03   3.104 0.001946 ** 
## bldg_type1fam           2.699e-02  1.062e-02   2.541 0.011163 *  
## condition2pos_n        -7.879e-01  7.940e-02  -9.923  < 2e-16 ***
## neighborhood_crawfor    1.269e-01  1.617e-02   7.851 8.32e-15 ***
## kitchen_qual_ex         7.455e-02  1.341e-02   5.561 3.22e-08 ***
## ms_zoning_c_all        -4.215e-01  3.485e-02 -12.096  < 2e-16 ***
## lot_area                2.766e-06  3.941e-07   7.019 3.52e-12 ***
## sale_condition_abnorml -2.802e-02  1.967e-02  -1.424 0.154564    
## heating_qc_ex           2.760e-02  7.032e-03   3.925 9.12e-05 ***
## ms_zoning_rm           -6.218e-02  9.787e-03  -6.353 2.87e-10 ***
## sale_type_new           8.657e-02  2.039e-02   4.247 2.31e-05 ***
## functional_typ          5.328e-02  1.270e-02   4.194 2.92e-05 ***
## condition1norm          4.382e-02  9.923e-03   4.416 1.08e-05 ***
## bsmt_exposure_gd        4.001e-02  1.154e-02   3.468 0.000541 ***
## fireplace_qu_none      -5.418e-03  1.341e-02  -0.404 0.686325    
## foundation_p_conc       2.307e-02  8.838e-03   2.610 0.009152 ** 
## garage_area             9.963e-05  2.966e-05   3.359 0.000803 ***
## neighborhood_stone_br   1.299e-01  2.225e-02   5.837 6.63e-09 ***
## exterior1st_brk_face    8.696e-02  2.069e-02   4.203 2.81e-05 ***
## gr_liv_area             3.839e-05  2.666e-05   1.440 0.150135    
## neighborhood_nridg_ht   7.552e-02  1.546e-02   4.886 1.15e-06 ***
## screen_porch            2.251e-04  5.082e-05   4.430 1.02e-05 ***
## wood_deck_sf            9.460e-05  2.412e-05   3.921 9.24e-05 ***
## neighborhood_somerst    5.219e-02  1.319e-02   3.958 7.95e-05 ***
## year_remod_add          8.439e-04  2.041e-04   4.134 3.78e-05 ***
## neighborhood_brk_side   4.458e-02  1.492e-02   2.987 0.002868 ** 
## heating_grav           -1.573e-01  4.092e-02  -3.845 0.000126 ***
## neighborhood_no_ridge   7.181e-02  1.903e-02   3.774 0.000168 ***
## bsmt_full_bath          2.620e-02  7.358e-03   3.561 0.000382 ***
## functional_maj2        -2.728e-01  4.969e-02  -5.490 4.80e-08 ***
## neighborhood_edwards   -5.540e-02  1.159e-02  -4.779 1.96e-06 ***
## garage_cond_fa         -5.952e-02  1.844e-02  -3.228 0.001276 ** 
## pool_qc_gd              2.621e-01  7.277e-02   3.602 0.000327 ***
## sale_type_con_ld        1.134e-01  3.499e-02   3.241 0.001219 ** 
## sale_condition_normal   3.493e-02  1.715e-02   2.036 0.041923 *  
## functional_sev         -2.584e-01  1.055e-01  -2.449 0.014455 *  
## bldg_type_twnhs        -5.236e-02  1.856e-02  -2.821 0.004857 ** 
## kitchen_abv_gr         -5.459e-02  1.648e-02  -3.313 0.000947 ***
## bsmt_qual_ex            4.047e-02  1.318e-02   3.070 0.002186 ** 
## neighborhood_meadow_v  -9.322e-02  2.765e-02  -3.371 0.000770 ***
## lot_config_cul_d_sac    4.125e-02  1.171e-02   3.522 0.000443 ***
## lot_frontage            4.000e-04  1.694e-04   2.361 0.018364 *  
## roof_matl_wd_shngl      6.308e-02  4.722e-02   1.336 0.181812    
## condition2pos_a         4.246e-01  1.038e-01   4.091 4.55e-05 ***
## foundation_stone        1.064e-01  4.250e-02   2.504 0.012408 *  
## central_air_n          -4.767e-02  1.387e-02  -3.437 0.000606 ***
## neighborhood_mitchel   -3.672e-02  1.558e-02  -2.356 0.018590 *  
## bsmt_fin_type2blq      -3.193e-02  1.963e-02  -1.627 0.103975    
## functional_mod         -8.554e-02  3.012e-02  -2.840 0.004585 ** 
## heating_gas_w           7.884e-02  2.671e-02   2.952 0.003215 ** 
## exterior1st_brk_comm   -1.953e-01  7.560e-02  -2.583 0.009903 ** 
## bsmt_exposure_no       -1.338e-02  6.722e-03  -1.990 0.046788 *  
## land_slope_sev         -2.409e-01  4.418e-02  -5.452 5.90e-08 ***
## roof_matl_membran       3.748e-01  1.131e-01   3.315 0.000940 ***
## neighborhood_clear_cr   2.828e-02  2.227e-02   1.270 0.204361    
## exterior1st_wd_sdng    -2.363e-02  8.896e-03  -2.656 0.007992 ** 
## garage_qual_ex          4.202e-01  1.218e-01   3.449 0.000579 ***
## electrical_s_brkr      -1.794e-02  1.107e-02  -1.621 0.105243    
## condition1artery       -5.343e-02  1.815e-02  -2.943 0.003300 ** 
## condition1rr_ae        -7.550e-02  3.219e-02  -2.345 0.019163 *  
## garage_type2types      -1.093e-01  4.346e-02  -2.516 0.011999 *  
## house_style1_5unf       5.291e-02  2.876e-02   1.840 0.065989 .  
## exterior1st_hd_board   -2.232e-02  8.405e-03  -2.656 0.008000 ** 
## enclosed_porch          9.642e-05  5.029e-05   1.917 0.055413 .  
## roof_matl_metal         2.827e-01  1.103e-01   2.563 0.010474 *  
## foundation_wood        -1.260e-01  5.903e-02  -2.135 0.032920 *  
## bsmt_fin_type2unf       1.703e-02  9.223e-03   1.847 0.064981 .  
## fence_gd_wo            -3.274e-02  1.437e-02  -2.279 0.022824 *  
## exterior2nd_brk_face   -5.595e-02  2.823e-02  -1.982 0.047685 *  
## exter_cond_ta           2.110e-02  8.839e-03   2.388 0.017090 *  
## bsmt_cond_po            1.607e-01  8.073e-02   1.991 0.046661 *  
## garage_cond_ex         -3.224e-01  1.414e-01  -2.280 0.022779 *  
## bsmt_fin_type1unf      -1.600e-02  9.163e-03  -1.747 0.080931 .  
## house_style2_5fin      -5.933e-02  4.102e-02  -1.447 0.148258    
## lot_config_corner       1.226e-02  7.241e-03   1.693 0.090662 .  
## condition1pos_n         4.469e-02  2.666e-02   1.677 0.093860 .  
## sale_type_con           1.145e-01  7.209e-02   1.589 0.112342    
## land_slope_gtl         -2.275e-02  1.416e-02  -1.607 0.108371    
## sale_type_cwd           8.159e-02  5.160e-02   1.581 0.114034    
## mas_vnr_type_brk_cmn   -4.944e-02  2.777e-02  -1.780 0.075248 .  
## pool_qc_ex              1.254e-01  7.882e-02   1.591 0.111789    
## mas_vnr_area            2.904e-05  1.868e-05   1.554 0.120350    
## roof_style_shed         3.091e-01  1.096e-01   2.821 0.004864 ** 
## condition2rr_ae        -3.422e-01  1.506e-01  -2.273 0.023191 *  
## fireplaces              1.600e-02  1.059e-02   1.510 0.131179    
## exterior2nd_plywood    -1.478e-02  1.018e-02  -1.452 0.146825    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.00988336)
## 
##     Null deviance: 232.801  on 1459  degrees of freedom
## Residual deviance:  13.501  on 1366  degrees of freedom
## AIC: -2504.5
## 
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit2), obs = train$log_sale_price))
##       RMSE   Rsquared        MAE 
## 0.09616150 0.94200760 0.06774188
PRESS(fit2$finalModel) #Press not working with caret models
## [1] Inf
varImp(fit2$finalModel)%>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))
##                         Overall
## roof_matl_cly_tile    16.670625
## overall_cond          12.316543
## ms_zoning_c_all       12.095573
## total_bsmt_sf         11.388951
## condition2pos_n        9.922932
## year_built             9.699077
## overall_qual           9.667944
## log_gr_liv_area        9.091300
## neighborhood_crawfor   7.850502
## lot_area               7.018695
## ms_zoning_rm           6.353041
## neighborhood_stone_br  5.836962
## kitchen_qual_ex        5.560925
## functional_maj2        5.489575
## land_slope_sev         5.452246
## neighborhood_nridg_ht  4.886112
## bsmt_unf_sf            4.847542
## neighborhood_edwards   4.778576
## screen_porch           4.430030
## condition1norm         4.416121
## sale_type_new          4.246877
## exterior1st_brk_face   4.202900
## functional_typ         4.193654
## year_remod_add         4.134330
## condition2pos_a        4.090827
# Output the predictions for the test set to a csv file
# fit2x <- glm(formula = formula(fit2), data = train)
forward_pred <- predict(fit2$finalModel, test)

forward_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(forward_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/forward_predictions.csv")

Forward: AIC -2504, RMSE: .096, R2: 0.942. Adjusted R2 is probably in there, but I’m not sure how to find it.

Backward Selection

# Backwards Selection

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_backwards.rds")) {
  # Load the model object from disk
  fit3 <- readRDS("Models/lm_backwards.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  fit3 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
    direction = "backward",
    penter = 0.05 # Not Working.
  )

  # Stop the parallel backend
  stopCluster(cl)
  
  # Save the model object to disk
  saveRDS(fit3, "Models/lm_backwards.rds")
}

summary(fit3$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.67457  -0.04458   0.00227   0.04964   0.56677  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.883e+00  6.580e-01   5.902 4.55e-09 ***
## ms_sub_class            -4.289e-04  1.145e-04  -3.745 0.000188 ***
## ms_zoning_c_all         -3.520e-01  3.882e-02  -9.066  < 2e-16 ***
## ms_zoning_fv             7.715e-02  1.916e-02   4.027 5.96e-05 ***
## ms_zoning_rl             3.115e-02  1.224e-02   2.545 0.011029 *  
## lot_frontage             3.672e-04  1.754e-04   2.094 0.036471 *  
## lot_area                 2.977e-06  4.236e-07   7.028 3.33e-12 ***
## street_grvl             -9.790e-02  4.579e-02  -2.138 0.032720 *  
## lot_shape_ir2            2.714e-02  1.725e-02   1.573 0.116000    
## land_contour_bnk        -2.563e-02  1.453e-02  -1.763 0.078094 .  
## land_contour_low        -3.702e-02  2.221e-02  -1.667 0.095824 .  
## utilities_all_pub        1.908e-01  1.028e-01   1.856 0.063689 .  
## lot_config_corner        1.345e-02  7.317e-03   1.838 0.066279 .  
## lot_config_cul_d_sac     3.677e-02  1.203e-02   3.057 0.002277 ** 
## lot_config_fr2          -2.216e-02  1.555e-02  -1.425 0.154307    
## land_slope_gtl           2.234e-01  4.614e-02   4.842 1.44e-06 ***
## land_slope_mod           2.558e-01  4.641e-02   5.513 4.24e-08 ***
## neighborhood_br_dale    -6.804e-02  3.309e-02  -2.056 0.039990 *  
## neighborhood_collg_cr   -2.287e-02  1.203e-02  -1.901 0.057538 .  
## neighborhood_crawfor     9.867e-02  1.722e-02   5.732 1.23e-08 ***
## neighborhood_edwards    -8.963e-02  1.338e-02  -6.698 3.11e-11 ***
## neighborhood_gilbert    -2.202e-02  1.503e-02  -1.466 0.142963    
## neighborhood_idotrr     -5.114e-02  2.366e-02  -2.161 0.030861 *  
## neighborhood_meadow_v   -1.507e-01  2.989e-02  -5.041 5.28e-07 ***
## neighborhood_mitchel    -6.355e-02  1.686e-02  -3.770 0.000170 ***
## neighborhood_n_ames     -4.350e-02  1.164e-02  -3.736 0.000195 ***
## neighborhood_no_ridge    4.911e-02  1.973e-02   2.490 0.012913 *  
## neighborhood_nridg_ht    6.064e-02  1.609e-02   3.768 0.000172 ***
## neighborhood_nw_ames    -4.864e-02  1.524e-02  -3.192 0.001446 ** 
## neighborhood_old_town   -5.997e-02  1.632e-02  -3.673 0.000249 ***
## neighborhood_sawyer     -3.586e-02  1.499e-02  -2.392 0.016878 *  
## neighborhood_stone_br    1.166e-01  2.251e-02   5.180 2.56e-07 ***
## condition1artery        -9.408e-02  1.666e-02  -5.648 1.99e-08 ***
## condition1feedr         -4.539e-02  1.246e-02  -3.641 0.000282 ***
## condition1pos_a         -5.644e-02  3.771e-02  -1.496 0.134772    
## condition1rr_ae         -1.374e-01  3.161e-02  -4.345 1.50e-05 ***
## condition2pos_a          3.256e-01  1.066e-01   3.053 0.002311 ** 
## condition2pos_n         -7.624e-01  7.411e-02 -10.288  < 2e-16 ***
## condition2rr_ae         -4.546e-01  1.522e-01  -2.988 0.002864 ** 
## bldg_type2fm_con         5.721e-02  2.574e-02   2.222 0.026418 *  
## bldg_type_twnhs         -3.499e-02  2.020e-02  -1.732 0.083469 .  
## house_style1_5fin       -2.753e-02  1.378e-02  -1.997 0.046028 *  
## house_style1_5unf        4.888e-02  2.922e-02   1.673 0.094655 .  
## house_style2_5fin       -1.038e-01  4.435e-02  -2.340 0.019449 *  
## house_style2story       -5.251e-02  1.540e-02  -3.409 0.000672 ***
## house_style_s_foyer      2.854e-02  1.967e-02   1.451 0.147089    
## overall_qual             3.638e-02  4.021e-03   9.048  < 2e-16 ***
## overall_cond             3.993e-02  3.472e-03  11.498  < 2e-16 ***
## year_built               1.773e-03  2.662e-04   6.663 3.91e-11 ***
## year_remod_add           7.281e-04  2.150e-04   3.387 0.000727 ***
## roof_style_flat         -3.546e-01  1.126e-01  -3.150 0.001668 ** 
## roof_style_gable        -3.568e-01  1.105e-01  -3.228 0.001277 ** 
## roof_style_gambrel      -3.561e-01  1.151e-01  -3.093 0.002023 ** 
## roof_style_hip          -3.557e-01  1.108e-01  -3.212 0.001352 ** 
## roof_style_mansard      -3.184e-01  1.170e-01  -2.721 0.006598 ** 
## roof_matl_cly_tile      -2.185e+00  1.893e-01 -11.543  < 2e-16 ***
## roof_matl_membran        4.269e-01  1.168e-01   3.655 0.000268 ***
## roof_matl_metal          3.530e-01  1.127e-01   3.133 0.001769 ** 
## exterior1st_brk_comm    -1.897e-01  7.826e-02  -2.424 0.015495 *  
## exterior1st_brk_face     9.871e-02  2.080e-02   4.745 2.31e-06 ***
## exterior1st_wd_sdng     -2.317e-02  8.947e-03  -2.589 0.009717 ** 
## exterior2nd_asb_shng    -4.292e-02  2.426e-02  -1.769 0.077070 .  
## exterior2nd_brk_face    -6.603e-02  2.836e-02  -2.328 0.020037 *  
## exterior2nd_hd_board    -2.396e-02  9.108e-03  -2.630 0.008633 ** 
## exterior2nd_plywood     -2.385e-02  1.093e-02  -2.183 0.029215 *  
## mas_vnr_type_brk_cmn    -4.568e-02  2.797e-02  -1.633 0.102662    
## mas_vnr_area             3.606e-05  1.946e-05   1.853 0.064166 .  
## exter_cond_fa           -4.075e-02  2.302e-02  -1.770 0.076924 .  
## exter_cond_gd           -2.126e-02  9.648e-03  -2.204 0.027701 *  
## foundation_brk_til       1.267e-01  6.062e-02   2.090 0.036785 *  
## foundation_c_block       1.477e-01  5.966e-02   2.476 0.013409 *  
## foundation_p_conc        1.623e-01  5.925e-02   2.739 0.006239 ** 
## foundation_slab          1.270e-01  6.523e-02   1.947 0.051733 .  
## foundation_stone         2.217e-01  7.402e-02   2.994 0.002800 ** 
## bsmt_qual_ex             3.901e-02  1.336e-02   2.919 0.003571 ** 
## bsmt_cond_po             2.904e-01  1.187e-01   2.446 0.014582 *  
## bsmt_exposure_av         1.922e-02  8.624e-03   2.229 0.025974 *  
## bsmt_exposure_gd         5.220e-02  1.166e-02   4.476 8.24e-06 ***
## bsmt_fin_type1glq        1.744e-02  8.675e-03   2.011 0.044541 *  
## bsmt_fin_sf1             1.412e-04  1.597e-05   8.842  < 2e-16 ***
## bsmt_fin_type2blq       -3.959e-02  1.889e-02  -2.096 0.036245 *  
## bsmt_fin_sf2             1.250e-04  2.243e-05   5.574 3.00e-08 ***
## bsmt_unf_sf              8.153e-05  1.433e-05   5.688 1.57e-08 ***
## heating_floor           -1.470e-01  1.062e-01  -1.384 0.166597    
## heating_gas_a           -4.891e-02  2.479e-02  -1.973 0.048689 *  
## heating_grav            -2.057e-01  4.845e-02  -4.244 2.34e-05 ***
## heating_qc_ex            2.554e-02  7.192e-03   3.552 0.000396 ***
## central_air_n           -4.817e-02  1.474e-02  -3.267 0.001113 ** 
## electrical_mix          -2.741e-01  1.709e-01  -1.603 0.109070    
## x2nd_flr_sf              6.189e-05  1.964e-05   3.150 0.001667 ** 
## bsmt_full_bath           2.530e-02  7.627e-03   3.317 0.000933 ***
## full_bath                1.444e-02  8.842e-03   1.633 0.102765    
## half_bath                1.566e-02  8.588e-03   1.823 0.068490 .  
## kitchen_abv_gr          -6.371e-02  1.633e-02  -3.902 0.000100 ***
## kitchen_qual_ex          7.942e-02  1.339e-02   5.930 3.86e-09 ***
## functional_maj1         -6.194e-02  2.986e-02  -2.075 0.038216 *  
## functional_maj2         -3.103e-01  5.107e-02  -6.076 1.60e-09 ***
## functional_min1         -4.034e-02  1.949e-02  -2.069 0.038705 *  
## functional_min2         -5.192e-02  1.883e-02  -2.758 0.005895 ** 
## functional_mod          -1.444e-01  2.911e-02  -4.959 7.99e-07 ***
## functional_sev          -3.109e-01  1.115e-01  -2.789 0.005359 ** 
## fireplaces               1.945e-02  5.429e-03   3.583 0.000351 ***
## garage_type2types       -9.648e-02  4.396e-02  -2.195 0.028344 *  
## garage_yr_blt           -3.203e-04  2.231e-04  -1.436 0.151321    
## garage_cars              2.131e-02  8.954e-03   2.380 0.017473 *  
## garage_area              1.260e-04  3.119e-05   4.041 5.63e-05 ***
## garage_qual_ex           4.518e-01  1.135e-01   3.981 7.23e-05 ***
## garage_qual_fa          -3.562e-02  1.938e-02  -1.838 0.066330 .  
## garage_cond_ex          -3.720e-01  1.342e-01  -2.772 0.005653 ** 
## garage_cond_fa          -3.758e-02  2.122e-02  -1.771 0.076769 .  
## garage_cond_po           6.509e-02  4.779e-02   1.362 0.173439    
## wood_deck_sf             9.315e-05  2.399e-05   3.884 0.000108 ***
## enclosed_porch           8.231e-05  5.040e-05   1.633 0.102699    
## screen_porch             2.401e-04  5.211e-05   4.609 4.44e-06 ***
## pool_area                1.446e-03  6.946e-04   2.082 0.037559 *  
## pool_qc_ex              -6.779e-01  3.796e-01  -1.786 0.074312 .  
## pool_qc_fa              -8.767e-01  4.133e-01  -2.121 0.034081 *  
## pool_qc_gd              -6.705e-01  4.577e-01  -1.465 0.143214    
## fence_gd_wo             -3.508e-02  1.460e-02  -2.402 0.016436 *  
## sale_type_con            1.146e-01  7.176e-02   1.596 0.110650    
## sale_type_con_ld         1.336e-01  3.621e-02   3.689 0.000234 ***
## sale_type_cwd            8.893e-02  5.156e-02   1.725 0.084786 .  
## sale_type_new            1.025e-01  1.486e-02   6.900 8.01e-12 ***
## sale_condition_adj_land  7.949e-02  5.470e-02   1.453 0.146406    
## sale_condition_normal    5.248e-02  9.916e-03   5.292 1.41e-07 ***
## log_gr_liv_area          4.012e-01  2.362e-02  16.984  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009762914)
## 
##     Null deviance: 232.801  on 1459  degrees of freedom
## Residual deviance:  13.024  on 1334  degrees of freedom
## AIC: -2493.1
## 
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit3), obs = train$log_sale_price))
##       RMSE   Rsquared        MAE 
## 0.09444766 0.94405631 0.06608136
PRESS(fit3$finalModel) # Press not working with caret models
## [1] Inf
varImp(fit3$finalModel)%>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))
##                         Overall
## log_gr_liv_area       16.984408
## roof_matl_cly_tile    11.542834
## overall_cond          11.497800
## condition2pos_n       10.287568
## ms_zoning_c_all        9.065580
## overall_qual           9.047682
## bsmt_fin_sf1           8.842386
## lot_area               7.027994
## sale_type_new          6.900116
## neighborhood_edwards   6.697630
## year_built             6.663097
## functional_maj2        6.076291
## kitchen_qual_ex        5.929819
## neighborhood_crawfor   5.731806
## bsmt_unf_sf            5.688449
## condition1artery       5.647557
## bsmt_fin_sf2           5.574467
## land_slope_mod         5.512792
## sale_condition_normal  5.292479
## neighborhood_stone_br  5.180033
## neighborhood_meadow_v  5.040680
## functional_mod         4.959119
## land_slope_gtl         4.841711
## exterior1st_brk_face   4.744673
## screen_porch           4.608665
## bsmt_exposure_gd       4.476437
## condition1rr_ae        4.345298
## heating_grav           4.244451
## garage_area            4.040683
## ms_zoning_fv           4.027130
# Output the predictions for the test set to a csv file
# fit3x <- glm(formula = formula(fit3), data = train)
backward_pred <- predict(fit3$finalModel, newdata = test)

backward_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(backward_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/backward_predictions.csv")

Backward: AIC: -2493, RMSE: .0944, R2: 0.944

Stepwise Selection

# Stepwise Selection

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_stepwise.rds")) {
  # Load the model object from disk
  fit4 <- readRDS("Models/lm_stepwise.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  fit4 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
    direction = "both",
    penter = 0.05 # Not Working.
  )
  
  # Stop the parallel backend
  stopCluster(cl)
  
  # Save the model object to disk
  saveRDS(fit4, "Models/lm_stepwise.rds")
}

summary(fit4$finalModel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.67385  -0.04499   0.00226   0.04994   0.56163  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.870e+00  6.587e-01   5.875 5.35e-09 ***
## ms_sub_class            -4.289e-04  1.145e-04  -3.746 0.000188 ***
## ms_zoning_c_all         -3.515e-01  3.879e-02  -9.062  < 2e-16 ***
## ms_zoning_fv             7.642e-02  1.914e-02   3.992 6.92e-05 ***
## ms_zoning_rl             2.943e-02  1.230e-02   2.394 0.016823 *  
## lot_frontage             3.848e-04  1.754e-04   2.194 0.028443 *  
## lot_area                 2.899e-06  4.260e-07   6.807 1.51e-11 ***
## street_grvl             -9.626e-02  4.576e-02  -2.104 0.035599 *  
## lot_shape_ir2            2.804e-02  1.725e-02   1.625 0.104295    
## land_contour_bnk        -2.574e-02  1.452e-02  -1.772 0.076560 .  
## land_contour_low        -4.131e-02  2.234e-02  -1.849 0.064644 .  
## utilities_all_pub        1.911e-01  1.027e-01   1.861 0.063023 .  
## lot_config_corner        1.286e-02  7.315e-03   1.758 0.078980 .  
## lot_config_cul_d_sac     3.778e-02  1.203e-02   3.141 0.001722 ** 
## lot_config_fr2          -2.215e-02  1.554e-02  -1.425 0.154248    
## lot_config_fr3          -6.926e-02  5.109e-02  -1.355 0.175506    
## land_slope_gtl           2.144e-01  4.640e-02   4.620 4.20e-06 ***
## land_slope_mod           2.476e-01  4.661e-02   5.313 1.26e-07 ***
## neighborhood_br_dale    -6.773e-02  3.307e-02  -2.048 0.040755 *  
## neighborhood_collg_cr   -2.313e-02  1.202e-02  -1.924 0.054566 .  
## neighborhood_crawfor     9.824e-02  1.720e-02   5.711 1.39e-08 ***
## neighborhood_edwards    -9.012e-02  1.337e-02  -6.740 2.36e-11 ***
## neighborhood_gilbert    -2.255e-02  1.502e-02  -1.502 0.133422    
## neighborhood_idotrr     -5.336e-02  2.369e-02  -2.253 0.024418 *  
## neighborhood_meadow_v   -1.517e-01  2.989e-02  -5.078 4.36e-07 ***
## neighborhood_mitchel    -6.330e-02  1.684e-02  -3.758 0.000178 ***
## neighborhood_n_ames     -4.368e-02  1.163e-02  -3.755 0.000181 ***
## neighborhood_no_ridge    4.883e-02  1.971e-02   2.477 0.013368 *  
## neighborhood_nridg_ht    6.432e-02  1.619e-02   3.974 7.46e-05 ***
## neighborhood_nw_ames    -4.963e-02  1.523e-02  -3.258 0.001152 ** 
## neighborhood_old_town   -6.204e-02  1.636e-02  -3.792 0.000156 ***
## neighborhood_sawyer     -3.616e-02  1.498e-02  -2.414 0.015899 *  
## neighborhood_stone_br    1.183e-01  2.251e-02   5.255 1.72e-07 ***
## condition1artery        -9.396e-02  1.664e-02  -5.646 2.01e-08 ***
## condition1feedr         -4.637e-02  1.247e-02  -3.720 0.000207 ***
## condition1rr_ae         -1.374e-01  3.158e-02  -4.350 1.46e-05 ***
## condition2pos_a          3.291e-01  1.066e-01   3.088 0.002057 ** 
## condition2pos_n         -7.581e-01  7.408e-02 -10.233  < 2e-16 ***
## condition2rr_ae         -4.540e-01  1.520e-01  -2.987 0.002871 ** 
## bldg_type2fm_con         5.696e-02  2.572e-02   2.214 0.026973 *  
## bldg_type_twnhs         -3.701e-02  2.021e-02  -1.831 0.067303 .  
## house_style1_5fin       -2.637e-02  1.379e-02  -1.913 0.056014 .  
## house_style1_5unf        4.834e-02  2.920e-02   1.655 0.098144 .  
## house_style2_5fin       -1.017e-01  4.432e-02  -2.296 0.021855 *  
## house_style2story       -5.083e-02  1.541e-02  -3.298 0.000998 ***
## house_style_s_foyer      2.841e-02  1.965e-02   1.445 0.148553    
## overall_qual             3.593e-02  4.027e-03   8.923  < 2e-16 ***
## overall_cond             4.004e-02  3.470e-03  11.541  < 2e-16 ***
## year_built               1.783e-03  2.662e-04   6.699 3.09e-11 ***
## year_remod_add           7.367e-04  2.148e-04   3.430 0.000623 ***
## roof_style_flat         -3.563e-01  1.125e-01  -3.169 0.001566 ** 
## roof_style_gable        -3.587e-01  1.104e-01  -3.248 0.001190 ** 
## roof_style_gambrel      -3.589e-01  1.150e-01  -3.120 0.001847 ** 
## roof_style_hip          -3.564e-01  1.107e-01  -3.221 0.001311 ** 
## roof_style_mansard      -3.188e-01  1.169e-01  -2.727 0.006470 ** 
## roof_matl_cly_tile      -2.170e+00  1.893e-01 -11.464  < 2e-16 ***
## roof_matl_membran        4.247e-01  1.167e-01   3.638 0.000285 ***
## roof_matl_metal          3.488e-01  1.126e-01   3.098 0.001989 ** 
## exterior1st_brk_comm    -1.897e-01  7.819e-02  -2.426 0.015412 *  
## exterior1st_brk_face     9.884e-02  2.078e-02   4.755 2.20e-06 ***
## exterior1st_wd_sdng     -2.245e-02  8.945e-03  -2.509 0.012214 *  
## exterior2nd_asb_shng    -4.330e-02  2.424e-02  -1.787 0.074222 .  
## exterior2nd_brk_face    -6.849e-02  2.836e-02  -2.415 0.015875 *  
## exterior2nd_hd_board    -2.379e-02  9.100e-03  -2.614 0.009050 ** 
## exterior2nd_plywood     -2.363e-02  1.092e-02  -2.165 0.030543 *  
## mas_vnr_type_brk_cmn    -4.461e-02  2.795e-02  -1.596 0.110721    
## mas_vnr_area             3.470e-05  1.946e-05   1.783 0.074840 .  
## exter_cond_fa           -4.078e-02  2.300e-02  -1.773 0.076491 .  
## exter_cond_gd           -2.089e-02  9.644e-03  -2.166 0.030473 *  
## foundation_brk_til       1.279e-01  6.056e-02   2.112 0.034893 *  
## foundation_c_block       1.487e-01  5.961e-02   2.494 0.012758 *  
## foundation_p_conc        1.632e-01  5.920e-02   2.757 0.005908 ** 
## foundation_slab          1.256e-01  6.517e-02   1.927 0.054188 .  
## foundation_stone         2.226e-01  7.395e-02   3.010 0.002660 ** 
## bsmt_qual_ex             3.767e-02  1.337e-02   2.817 0.004914 ** 
## bsmt_cond_po             2.893e-01  1.187e-01   2.438 0.014890 *  
## bsmt_exposure_av         1.999e-02  8.628e-03   2.317 0.020656 *  
## bsmt_exposure_gd         5.066e-02  1.168e-02   4.338 1.55e-05 ***
## bsmt_fin_type1glq        1.752e-02  8.668e-03   2.021 0.043428 *  
## bsmt_fin_sf1             1.396e-04  1.598e-05   8.737  < 2e-16 ***
## bsmt_fin_type2blq       -3.971e-02  1.887e-02  -2.104 0.035542 *  
## bsmt_fin_sf2             1.236e-04  2.242e-05   5.515 4.19e-08 ***
## bsmt_unf_sf              7.977e-05  1.436e-05   5.556 3.32e-08 ***
## heating_floor           -1.526e-01  1.061e-01  -1.438 0.150690    
## heating_gas_a           -5.058e-02  2.479e-02  -2.040 0.041498 *  
## heating_grav            -2.070e-01  4.842e-02  -4.275 2.04e-05 ***
## heating_qc_ex            2.629e-02  7.194e-03   3.654 0.000268 ***
## central_air_n           -4.895e-02  1.474e-02  -3.322 0.000918 ***
## electrical_mix          -2.711e-01  1.708e-01  -1.587 0.112792    
## x2nd_flr_sf              5.997e-05  1.965e-05   3.052 0.002315 ** 
## bsmt_full_bath           2.547e-02  7.622e-03   3.342 0.000856 ***
## full_bath                1.567e-02  8.856e-03   1.769 0.077108 .  
## half_bath                1.527e-02  8.590e-03   1.777 0.075740 .  
## kitchen_abv_gr          -6.334e-02  1.632e-02  -3.882 0.000109 ***
## kitchen_qual_ex          7.973e-02  1.338e-02   5.958 3.26e-09 ***
## functional_maj1         -6.256e-02  2.983e-02  -2.097 0.036152 *  
## functional_maj2         -3.114e-01  5.102e-02  -6.103 1.36e-09 ***
## functional_min1         -4.067e-02  1.947e-02  -2.089 0.036930 *  
## functional_min2         -5.207e-02  1.881e-02  -2.769 0.005708 ** 
## functional_mod          -1.442e-01  2.909e-02  -4.956 8.10e-07 ***
## functional_sev          -3.118e-01  1.114e-01  -2.800 0.005180 ** 
## fireplaces               1.968e-02  5.430e-03   3.624 0.000301 ***
## garage_type2types       -9.617e-02  4.392e-02  -2.190 0.028724 *  
## garage_yr_blt           -3.249e-04  2.232e-04  -1.456 0.145706    
## garage_cars              2.158e-02  8.946e-03   2.412 0.015995 *  
## garage_area              1.255e-04  3.116e-05   4.027 5.98e-05 ***
## garage_qual_ex           3.728e-01  1.229e-01   3.032 0.002474 ** 
## garage_qual_fa          -3.683e-02  1.941e-02  -1.897 0.058026 .  
## garage_cond_ex          -2.936e-01  1.421e-01  -2.066 0.039012 *  
## garage_cond_fa          -3.443e-02  2.133e-02  -1.615 0.106608    
## garage_cond_po           6.682e-02  4.776e-02   1.399 0.162004    
## wood_deck_sf             8.875e-05  2.412e-05   3.680 0.000242 ***
## enclosed_porch           8.528e-05  5.039e-05   1.692 0.090813 .  
## screen_porch             2.428e-04  5.208e-05   4.662 3.44e-06 ***
## pool_area                1.455e-03  6.940e-04   2.096 0.036241 *  
## pool_qc_ex              -6.816e-01  3.792e-01  -1.797 0.072492 .  
## pool_qc_fa              -8.811e-01  4.129e-01  -2.134 0.033033 *  
## pool_qc_gd              -6.754e-01  4.573e-01  -1.477 0.139923    
## fence_gd_wo             -3.495e-02  1.459e-02  -2.396 0.016726 *  
## sale_type_con            1.165e-01  7.171e-02   1.625 0.104492    
## sale_type_con_ld         1.345e-01  3.618e-02   3.719 0.000208 ***
## sale_type_cwd            8.900e-02  5.151e-02   1.728 0.084270 .  
## sale_type_new            1.020e-01  1.486e-02   6.863 1.03e-11 ***
## sale_condition_adj_land  8.051e-02  5.466e-02   1.473 0.140988    
## sale_condition_normal    5.254e-02  9.907e-03   5.303 1.33e-07 ***
## log_gr_liv_area          4.012e-01  2.360e-02  16.999  < 2e-16 ***
## roof_matl_wd_shngl       7.825e-02  4.723e-02   1.657 0.097822 .  
## condition1pos_a         -5.642e-02  3.768e-02  -1.497 0.134543    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.009744362)
## 
##     Null deviance: 232.801  on 1459  degrees of freedom
## Residual deviance:  12.979  on 1332  degrees of freedom
## AIC: -2494
## 
## Number of Fisher Scoring iterations: 2
defaultSummary(data.frame(pred = predict(fit4), obs = train$log_sale_price))
##       RMSE   Rsquared        MAE 
## 0.09428712 0.94424633 0.06603357
PRESS(fit4$finalModel) # Press not working with caret models
## [1] Inf
varImp(fit4$finalModel) %>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))
##                         Overall
## log_gr_liv_area       16.998839
## overall_cond          11.540682
## roof_matl_cly_tile    11.463697
## condition2pos_n       10.233332
## ms_zoning_c_all        9.061792
## overall_qual           8.923294
## bsmt_fin_sf1           8.736801
## sale_type_new          6.862606
## lot_area               6.806617
## neighborhood_edwards   6.739504
## year_built             6.698790
## functional_maj2        6.103470
## kitchen_qual_ex        5.958244
## neighborhood_crawfor   5.710567
## condition1artery       5.645616
## bsmt_unf_sf            5.556263
## bsmt_fin_sf2           5.514768
## land_slope_mod         5.312905
## sale_condition_normal  5.303034
## neighborhood_stone_br  5.254721
## neighborhood_meadow_v  5.077700
## functional_mod         4.956417
## exterior1st_brk_face   4.755398
## screen_porch           4.662298
## land_slope_gtl         4.620443
## condition1rr_ae        4.350094
## bsmt_exposure_gd       4.337773
## heating_grav           4.275294
## garage_area            4.026542
# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
stepwise_pred <- predict(fit4$finalModel, newdata = test)

stepwise_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(stepwise_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/stepwise_predictions.csv")

AIC: -2493, RMSE: .0944, R2: 0.944 (same as backward)

Custom features

For custom features, we wil use the top ten parameters ranked by importance from the stepwise model. We expect that this model will not perform as well, but each parameter will be more explainable, and fitting should require much less compute. The question we hope to answer is whether the increase in performance is worth the cost for one of the brute-force models, and whether ten parameters is too few to capture the complexity of the data.

# Custom Feature Selection
top10 <- varImp(fit4$finalModel) %>%
  filter(Overall > 4) %>%
  arrange(desc(Overall)) %>%
  head(10) %>%
  rownames()

form <- as.formula(paste("log_sale_price ~", paste(top10, collapse = "+")))
fit5 <- lm(form, data = train)


summary(fit5)
## 
## Call:
## lm(formula = form, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.88743 -0.08720  0.01853  0.10421  0.62038 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.693e+00  1.100e-01  69.902  < 2e-16 ***
## log_gr_liv_area       4.401e-01  1.647e-02  26.712  < 2e-16 ***
## overall_cond          2.339e-02  3.851e-03   6.074 1.59e-09 ***
## roof_matl_cly_tile   -2.526e+00  1.708e-01 -14.793  < 2e-16 ***
## condition2pos_n      -9.189e-01  1.159e-01  -7.930 4.35e-15 ***
## ms_zoning_c_all      -4.007e-01  5.160e-02  -7.765 1.53e-14 ***
## overall_qual          1.429e-01  4.195e-03  34.059  < 2e-16 ***
## bsmt_fin_sf1          1.989e-04  1.013e-05  19.635  < 2e-16 ***
## sale_type_new         1.788e-01  1.637e-02  10.924  < 2e-16 ***
## lot_area              3.665e-06  4.442e-07   8.250 3.53e-16 ***
## neighborhood_edwards -6.591e-02  1.718e-02  -3.837  0.00013 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1603 on 1449 degrees of freedom
## Multiple R-squared:  0.8401, Adjusted R-squared:  0.839 
## F-statistic: 761.5 on 10 and 1449 DF,  p-value: < 2.2e-16
defaultSummary(data.frame(pred = predict(fit5), obs = train$log_sale_price))
##      RMSE  Rsquared       MAE 
## 0.1596549 0.8401425 0.1198526
PRESS(fit5) # Press not working with caret models
## [1] Inf
# varImp(fit5$finalModel) %>%
  # filter(Overall > 4) %>%
  # arrange(desc(Overall))

# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
custom_pred <- predict(fit5, newdata = test)

custom_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(custom_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/custom_predictions.csv")

AIC: -1190, RMSE: .16, R2: 0.84 (worse but more parsimonious)

Model Evaluation

Adj R2, Internal CV press, Kaggle score

Checking Assumptions

Checking the assumptions of the forward model. The other models should be similar.

Residual Plots

# Plot the studentized residuals using base R
plot(fit2$finalModel$fitted.values, fit2$finalModel$residuals, type = "p")

plot(fit2$finalModel$residuals)

hist(fit2$finalModel$residuals, freq = FALSE)
curve(dnorm(x, mean = mean(fit2$finalModel$residuals), sd = sd(fit2$finalModel$residuals)), add = TRUE, col = "blue")

There is no evidence of non-linearity, heteroscedasticity, or non-normality.

Influential Points

# Plot the residuals vs. the fitted values
fit2$finalModel %>%
  plot()

There are several points with a cook’s d approximately 1 which could be investigated. However, the results are consistent with the assumptions of the linear model.

Conclusion

Conclusion text

Appendix

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

# Required Libraries
library(tidyverse)
library(knitr)
library(kableExtra)
library(ggthemes)
library(caret)
library(janitor)
library(doParallel)

#library(e1071)
#library(class)
train <- read.csv("Data/train.csv")
test <- read.csv("Data/test.csv")

# Merge the data frames and add a column indicating whether they come from the train or test set
train$train <- 1
test$SalePrice <- NA
test$train <- 0
ames <- rbind(train, test)

# Verify data frame
head(ames)
str(ames)
summary(ames)
# Summarize NA's by  column
ames %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  gather(key = "Column", value = "NA_Count", -1) %>%
  filter(NA_Count > 0) %>%
  ggplot(aes(x = reorder(Column, NA_Count), y = NA_Count)) +
  geom_col() +
  coord_flip() +
  theme_gdocs() +
  labs(title = "Number of NA's by Column", x = "Column", y = "NA Count")

# Create a table of the missing NAs by column
ames %>%
  summarise_all(funs(sum(is.na(.)))) %>%
  gather(key = "Column", value = "NA_Count", -1) %>%
  filter(NA_Count > 0) %>%
  arrange(desc(NA_Count)) %>%
  kable()
# Plot Sale Price vs. Gross Living Area colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x = GrLivArea, y = SalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(title = "Sale Price vs. Gross Living Area by Neighborhood", x = "Gross Living Area", y = "Sale Price")
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA
ames %>%
  filter(!is.na(SalePrice)) %>%
  ggplot(aes(x = log(GrLivArea), y = log(SalePrice), color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  )
# Create columns for log(SalePrice) and log(GrLivArea)
ames$logSalePrice <- log(ames$SalePrice)
ames$logGrLivArea <- log(ames$GrLivArea)

PRESS <- function(linear.model) {
  #' calculate the predictive residuals
  pr <- residuals(linear.model) / (1 - lm.influence(linear.model)$hat)
  #' calculate the PRESS
  PRESS <- sum(pr^2)

  return(PRESS)
}
# Function for calculating PRESS
# Tom Hopper
# https://gist.github.com/tomhopper/8c204d978c4a0cbcb8c0
# Plot log(Sale Price) vs. log(Gross Living Area) colored by neighborhood, omitting rows where SalePrice is NA for only the neighborhoods of interest
century21 <-
  ames %>%
  filter(!is.na(SalePrice)) %>%
  filter(Neighborhood %in% c("NAmes", "Edwards", "BrkSide")) 
century21 %>%
  ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  )
# Fit a linear model to the data
fit1x <- lm(logSalePrice ~ logGrLivArea + Neighborhood, data = century21)
summary(fit1x)
PRESS(fit1x)

# Fit a linear model to the data with interaction variables
fit1 <- lm(logSalePrice ~ logGrLivArea * Neighborhood, data = century21)
summary(fit1)
PRESS(fit1)
confint(fit1) %>% kable()

# Plot the data with the linear model superposed
century21 %>%
  ggplot(aes(x = logGrLivArea, y = logSalePrice, color = Neighborhood)) +
  geom_point() +
  theme_gdocs() +
  labs(
    title = "log(Sale Price) vs. log(Gross Living Area) by Neighborhood",
    x = "log(Gross Living Area)",
    y = "log(Sale Price)"
  ) +
  geom_smooth(
    method = "lm", formula = y ~ x, se = FALSE, linewidth = 1,
    data = data.frame(
      logGrLivArea = century21$logGrLivArea,
      Neighborhood = century21$Neighborhood,
      logSalePrice = predict(fit1)
    )
  )

# # Print parameter estimate table nicely. Not working, needs debugging
# fit1 %>%
#   summary() %>%
#   {cbind(as.data.frame(coef(.)), .[["coefficients"]][, 2:4])} %>%
#   setNames(c("Estimate", "Std. Error", "t-value", "Pr(>|t|)")) %>%
#   rownames_to_column(var = "Term") %>%
#   mutate(Term = ifelse(Term == "(Intercept)", "Intercept", Term)) %>%
#   add_row(Term = "Adjusted R-squared", Estimate = round(.$adj.r.squared, 3), Std..Error = NA, `t-value` = NA, `Pr(>|t|)` = NA) %>%
#   kable(digits = 3, align = "c") %>%
#   kable_styling(full_width = FALSE)
# Plot the studentized residuals using base R
plot(fit1$fitted.values, fit1$residuals, type = "p")
plot(fit1$residuals)

# Decide which of these we like better ggplot or R

# Calculate studentized residuals
stud_res <- rstudent(fit1)
# Create a data frame with the studentized residuals
df <- data.frame(stud_res, logGrLivArea = model.frame(fit1)$logGrLivArea)

# Create a scatterplot of the studentized residuals
ggplot(df, aes(x = logGrLivArea, y = stud_res)) +
  geom_point() +
  labs(title = "Scatterplot of Studentized Residuals",
  x = "Studentized Residuals",
  y = "Frequency") +
  theme_minimal()

# Create histogram with normal curve
ggplot(df, aes(x = stud_res)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "lightblue", color = "black") +
  stat_function(fun = dnorm, args = list(mean = mean(df$stud_res), sd = sd(df$stud_res)), color = "blue", size = 1.2) +
  labs(title = "Histogram of Studentized Residuals with Normal Curve",
  x = "Studentized Residuals",
  y = "Frequency") +
  theme_minimal()
# Plot the residuals vs. the fitted values
fit1 %>%
  plot()
# Data Cleaning

# If pool-related variables are NA, assume there is no pool and assign to 0
ames <- ames %>%
  mutate(
    PoolQC = ifelse(is.na(PoolQC), "None", PoolQC),
    PoolArea = ifelse(is.na(PoolArea), 0, PoolArea),
  )
# If garage-related variables are NA, assume there is no garage and assign to 0
ames <- ames %>%
  mutate(
    GarageType = ifelse(is.na(GarageType), "None", GarageType),
    #GarageYrBlt = ifelse(is.na(GarageYrBlt), 0, GarageYrBlt), #These will be changed to the mean because of large year values
    GarageFinish = ifelse(is.na(GarageFinish), "None", GarageFinish),
    GarageCars = ifelse(is.na(GarageCars), 0, GarageCars),
    GarageArea = ifelse(is.na(GarageArea), 0, GarageArea),
    GarageQual = ifelse(is.na(GarageQual), "None", GarageQual),
    GarageCond = ifelse(is.na(GarageCond), "None", GarageCond)
  )
# If Bsmt-related variables are NA, assume there is no Bsmt and assign to 0
ames <- ames %>%
  mutate(
    BsmtQual = ifelse(is.na(BsmtQual), "None", BsmtQual),
    BsmtCond = ifelse(is.na(BsmtCond), "None", BsmtCond),
    BsmtExposure = ifelse(is.na(BsmtExposure), "None", BsmtExposure),
    BsmtFinType1 = ifelse(is.na(BsmtFinType1), "None", BsmtFinType1),
    BsmtFinSF1 = ifelse(is.na(BsmtFinSF1), 0, BsmtFinSF1),
    BsmtFinType2 = ifelse(is.na(BsmtFinType2), "None", BsmtFinType2),
    BsmtFinSF2 = ifelse(is.na(BsmtFinSF2), 0, BsmtFinSF2),
    BsmtUnfSF = ifelse(is.na(BsmtUnfSF), 0, BsmtUnfSF),
    TotalBsmtSF = ifelse(is.na(TotalBsmtSF), 0, TotalBsmtSF)
  )
# If Fence-related variables are NA, assume there is no Fence and assign to 0
ames <- ames %>%
  mutate(
    Fence = ifelse(is.na(Fence), "None", Fence), 
  )
# If Misc-related variables are NA, assume there is no Misc features and assign to 0
ames <- ames %>%
  mutate(
    MiscFeature = ifelse(is.na(MiscFeature), "None", MiscFeature), 
  )
# If Fireplace-related variables are NA, assume there is no Fireplace and assign to 0
ames <- ames %>%
  mutate(
    FireplaceQu = ifelse(is.na(FireplaceQu), "None", FireplaceQu),
  )
# If Alley-related variables are NA, assume there is no Alley and assign to 0
ames <- ames %>%
  mutate(
    Alley = ifelse(is.na(Alley), "None", Alley),
  )

# Summarize the amount of remaining NA's by column to check what's left
colSums(is.na(ames))

# Use the dummyVars() function to convert categorical variables into dummy variables
# Then use janitor::clean_names() to clean up the column names
dummy_model <- dummyVars(~ ., data = ames)
ames_dummy <- as.data.frame(predict(dummy_model, newdata = ames))
ames_dummy <- clean_names(ames_dummy)

# Fill in all remaining na values with the mean of the column
ames_dummy <- ames_dummy %>%
  mutate(across(
    c(-sale_price, -log_sale_price),
    ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)
  ))
# Summarize the amount of remaining NA's by column
colSums(is.na(ames_dummy))

# Split the data into training and testing sets
train <- ames_dummy[ames_dummy$train == 1, ]
test <- ames_dummy[ames_dummy$train == 0, ]
# Forward Selection

# # Testing olsrr method.
# library(olsrr)
# fit2x <- lm(log_sale_price ~ . - sale_price, data = train)
# fit2y <- ols_step_forward_p(fit2x, penter = 0.15)$model
# summary(fit2y)
# defaultSummary(data.frame(pred = predict(fit2y), obs = train$log_sale_price))
# PRESS(fit2y)

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_forwards.rds")) {
  # Load the model object from disk
  fit2 <- readRDS("Models/lm_forwards.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  ctrl <- trainControl(
    method = "boot",
    number = 5,
    allowParallel = TRUE
  )
  fit2 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = ctrl,
    direction = "forward",
    penter = 0.05 # Not Working.
  )

  # Stop the parallel backend
  stopCluster(cl)

  # Save the model object to disk
  saveRDS(fit2, "Models/lm_forwards.rds")
}

summary(fit2$finalModel)
defaultSummary(data.frame(pred = predict(fit2), obs = train$log_sale_price))
PRESS(fit2$finalModel) #Press not working with caret models
varImp(fit2$finalModel)%>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))

# Output the predictions for the test set to a csv file
# fit2x <- glm(formula = formula(fit2), data = train)
forward_pred <- predict(fit2$finalModel, test)

forward_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(forward_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/forward_predictions.csv")
# Backwards Selection

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_backwards.rds")) {
  # Load the model object from disk
  fit3 <- readRDS("Models/lm_backwards.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  fit3 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
    direction = "backward",
    penter = 0.05 # Not Working.
  )

  # Stop the parallel backend
  stopCluster(cl)
  
  # Save the model object to disk
  saveRDS(fit3, "Models/lm_backwards.rds")
}

summary(fit3$finalModel)
defaultSummary(data.frame(pred = predict(fit3), obs = train$log_sale_price))
PRESS(fit3$finalModel) # Press not working with caret models
varImp(fit3$finalModel)%>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))


# Output the predictions for the test set to a csv file
# fit3x <- glm(formula = formula(fit3), data = train)
backward_pred <- predict(fit3$finalModel, newdata = test)

backward_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(backward_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/backward_predictions.csv")
# Stepwise Selection

# Check if the model object exists, train if it doesn't
if (file.exists("Models/lm_stepwise.rds")) {
  # Load the model object from disk
  fit4 <- readRDS("Models/lm_stepwise.rds")
} else {
  # Perform stepwise selection

  # Set up a parallel backend with the number of cores you want to use
  cores <- 8 # Change this to the number of cores you want to use
  cl <- makePSOCKcluster(cores)
  registerDoParallel(cl)

  set.seed(137)
  fit4 <- train(log_sale_price ~ . - sale_price,
    data = train,
    method = "glmStepAIC",
    trControl = trainControl(method = "cv", number = 5, allowParallel = TRUE),
    direction = "both",
    penter = 0.05 # Not Working.
  )
  
  # Stop the parallel backend
  stopCluster(cl)
  
  # Save the model object to disk
  saveRDS(fit4, "Models/lm_stepwise.rds")
}

summary(fit4$finalModel)
defaultSummary(data.frame(pred = predict(fit4), obs = train$log_sale_price))
PRESS(fit4$finalModel) # Press not working with caret models
varImp(fit4$finalModel) %>%
  filter(Overall > 4) %>%
  arrange(desc(Overall))

# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
stepwise_pred <- predict(fit4$finalModel, newdata = test)

stepwise_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(stepwise_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/stepwise_predictions.csv")
# Custom Feature Selection
top10 <- varImp(fit4$finalModel) %>%
  filter(Overall > 4) %>%
  arrange(desc(Overall)) %>%
  head(10) %>%
  rownames()

form <- as.formula(paste("log_sale_price ~", paste(top10, collapse = "+")))
fit5 <- lm(form, data = train)


summary(fit5)
defaultSummary(data.frame(pred = predict(fit5), obs = train$log_sale_price))
PRESS(fit5) # Press not working with caret models
# varImp(fit5$finalModel) %>%
  # filter(Overall > 4) %>%
  # arrange(desc(Overall))

# Output the predictions for the test set to a csv file
# fit4x <- glm(formula = formula(fit4), data = train)
custom_pred <- predict(fit5, newdata = test)

custom_pred %>%
  data.frame() %>%
  rownames_to_column(var = "id") %>%
  mutate(SalePrice = exp(custom_pred)) %>%
  dplyr::select(id, SalePrice) %>%
  write_csv("Predictions/custom_predictions.csv")
# Plot the studentized residuals using base R
plot(fit2$finalModel$fitted.values, fit2$finalModel$residuals, type = "p")
plot(fit2$finalModel$residuals)
hist(fit2$finalModel$residuals, freq = FALSE)
curve(dnorm(x, mean = mean(fit2$finalModel$residuals), sd = sd(fit2$finalModel$residuals)), add = TRUE, col = "blue")
# Plot the residuals vs. the fitted values
fit2$finalModel %>%
  plot()